# Raw data locations on S3: four yearly NYC-taxi trip dumps (line-delimited
# JSON) plus two CSV lookup tables (vendor dimension and payment lookup).
_S3_BASE = 'https://s3.amazonaws.com/data-sprints-eng-test'
sources_trips = [
    f'{_S3_BASE}/data-sample_data-nyctaxi-trips-{year}-json_corrigido.json'
    for year in (2009, 2010, 2011, 2012)
]
vendor_dimension = f'{_S3_BASE}/data-vendor_lookup-csv.csv'
payment_lookup = f'{_S3_BASE}/data-payment_lookup-csv.csv'
# Imports: pandas for data handling, sqlalchemy for the database connection,
# matplotlib/seaborn for visualization, re for parsing years out of the URLs.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
import re
# Reset any leftover rcParams, then apply notebook-wide plotting defaults.
plt.rcParams.update(plt.rcParamsDefault)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
plt.rcParams['figure.figsize']=(18,9)
plt.rc('axes', titlesize=36)
sns.set_style("white")
def create_and_append_dataframe_from_json_file(source_filepath, target_dataframe):
    """Read a line-delimited JSON file and append its rows to a DataFrame.

    Parameters
    ----------
    source_filepath : str
        Path or URL of a JSON-lines file readable by ``pd.read_json``.
    target_dataframe : pd.DataFrame
        Frame to append to; it is NOT mutated.

    Returns
    -------
    pd.DataFrame
        A new frame containing ``target_dataframe`` rows followed by the
        file's rows, with a fresh 0..n-1 index.
    """
    dataframe_source = pd.read_json(path_or_buf=source_filepath, lines=True)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported replacement and already returns a new
    # object, so the original deep copy is unnecessary.
    df_return = pd.concat([target_dataframe, dataframe_source], ignore_index=True)
    print(f"{source_filepath} appended to target_dataframe")
    return df_return
# Download each yearly trip file once, building one big fact DataFrame.
# years_to_upload doubles as a "seen" set so a repeated URL is skipped.
years_to_upload = []
dataframe_fact = pd.DataFrame()
for filepath in sources_trips:
    # Raw string avoids the invalid-escape warning for "\d"; the first
    # 4-digit run in the URL is the trip year.
    year = int(re.findall(r"\d{4}", filepath)[0])
    if year in years_to_upload:
        # Already loaded: skip (the original had a dead `pass` here).
        print(f"{year} already readed and dataframe was created and appended, no action to do!")
        continue
    years_to_upload.append(year)
    dataframe_fact = create_and_append_dataframe_from_json_file(filepath, dataframe_fact)
# Load the two dimension tables straight from S3.
df_vendor_dimension = pd.read_csv(vendor_dimension)
df_vendor_dimension.head()
# header=1 takes the SECOND row of the CSV as the header — presumably the
# first row is a spurious/duplicate header line; verify against the file.
df_payment_lookup = pd.read_csv(payment_lookup, header=1)
df_payment_lookup.head()
print(years_to_upload)
# Quick sanity checks on the fact table built above (notebook display lines).
dataframe_fact.head()
dataframe_fact.tail()
dataframe_fact.shape
# Original note (translated): instantiate the database / create the DB
# connection / load the dataframes into tables — OR use something like AWS Glue?
years_to_upload.sort()
print(years_to_upload)
def create_date_table(start=None, end=None):
    """Build a calendar (date) dimension with one row per day in [start, end].

    Parameters
    ----------
    start, end : str or date-like, optional
        Inclusive range bounds accepted by ``pd.date_range``. When omitted,
        they default to Jan 1 of the first and Dec 31 of the last year in the
        module-level ``years_to_upload`` list.

    Returns
    -------
    pd.DataFrame
        Columns: date_id (YYYYMMDD int surrogate key), date, week_day, day,
        month, week (ISO week number), quarter, year.
    """
    # Resolve the defaults at CALL time, not def time: the original f-string
    # defaults were evaluated when the function was defined, freezing — or
    # crashing on — whatever years_to_upload held at that moment.
    if start is None:
        start = f'{years_to_upload[0]}-01-01'
    if end is None:
        end = f'{years_to_upload[-1]}-12-31'
    df = pd.DataFrame({"date": pd.date_range(start, end)})
    df["week_day"] = df.date.dt.day_name()
    df["day"] = df.date.dt.day
    df["month"] = df.date.dt.month
    df["week"] = df.date.dt.isocalendar().week
    df["quarter"] = df.date.dt.quarter
    df["year"] = df.date.dt.year
    # Surrogate key in YYYYMMDD form, e.g. 2009-01-05 -> 20090105.
    df.insert(0, 'date_id', (df.year.astype(str) + df.month.astype(str).str.zfill(2) + df.day.astype(str).str.zfill(2)).astype(int))
    return df
# Materialize the date dimension and normalize `date` to plain datetime.date
# so it can be merged against the fact table's pickup date.
df_datetime_dimension = create_date_table()
df_datetime_dimension['date'] = pd.to_datetime(df_datetime_dimension['date'],infer_datetime_format=True).dt.date
df_datetime_dimension.head()
# Derive the pickup calendar date on the fact table as the join key.
dataframe_fact['date'] = pd.to_datetime(dataframe_fact['pickup_datetime'],infer_datetime_format=True).dt.date
dataframe_fact.head()
# Enrich each trip with its calendar attributes (inner join on `date`).
full_dataframe = pd.merge(dataframe_fact, df_datetime_dimension, on='date', how='inner')
# Use the full pickup timestamp as a DatetimeIndex to enable the
# partial-string date slicing used below (e.g. .loc['2010']).
full_dataframe['pickup'] = pd.to_datetime(full_dataframe['pickup_datetime'])
full_dataframe = full_dataframe.set_index(['pickup'])
full_dataframe.head()
full_dataframe.head()
full_dataframe.dtypes
# Partial-string indexing: all rows whose pickup falls in 2009–2010.
full_dataframe.loc['2009':'2010']
df_payment_lookup.dtypes
# 1. Average distance traveled by trips with at most 2 passengers.
mask_small_parties = full_dataframe['passenger_count'] <= 2
media = full_dataframe.loc[mask_small_parties, 'trip_distance'].mean()
print(round(media, 2))
# 2. Top 3 vendors by total amount of money collected.
maiores_vendors = (
    full_dataframe
    .groupby(['vendor_id'])['total_amount']
    .sum()
    .sort_values(ascending=False)
)
# Display monetary values with two decimals.
pd.options.display.float_format = '{:.2f}'.format
maiores_vendors.reset_index()[:3]
# Attach the human-readable payment description to every trip.
full_dataframe_payment = pd.merge(full_dataframe, df_payment_lookup, on='payment_type', how='inner')
full_dataframe_payment.head()
# 3. Histogram of the monthly distribution, across the 4 years, of rides paid in cash.
corridas_dinheiro = full_dataframe_payment[full_dataframe_payment['payment_lookup'] == 'Cash']
corridas_dinheiro.reset_index()
corridas_dinheiro.shape
# Count cash rides per (year, month).
corridas_dinheiro_hist = corridas_dinheiro.groupby(['year', 'month'])['payment_type'].agg('count').reset_index()
corridas_dinheiro_hist
corridas_dinheiro_hist['year_month'] = corridas_dinheiro_hist['year'].astype(str) + '-' + corridas_dinheiro_hist['month'].astype(str)
# DataFrame.pivot lost positional-argument support (keyword-only since
# pandas 1.1, removed in 2.0) — pass index/columns/values by keyword.
corridas_dinheiro_hist.pivot(index='year', columns='month', values='payment_type').plot.bar()
full_dataframe_payment.dtypes
# 4. Time-series chart counting tips per day over the last 3 months of 2012.
full_dataframe_payment['pickup_datetime'] = pd.to_datetime(full_dataframe_payment['pickup_datetime'])
# Inclusive start / exclusive end covers Oct 1 00:00 through Dec 31 23:59.
# The original boundaries ('> 2012-10-01' and '<= 2012-12-31') dropped trips
# exactly at midnight Oct 1 and almost all of Dec 31, since the string
# compares as 2012-12-31 00:00:00.
mask = (full_dataframe_payment['pickup_datetime'] >= '2012-10-01') & (full_dataframe_payment['pickup_datetime'] < '2013-01-01')
gorjetas_3m_2012 = full_dataframe_payment.loc[mask]
gorjetas_3m_2012.tail(10)
# NOTE(review): this counts ALL rides per day, not only rides with a tip —
# confirm whether "quantidade de gorjetas" should filter on a tip column.
gorjetas_time_series = gorjetas_3m_2012.groupby(gorjetas_3m_2012['pickup_datetime'].dt.date)['pickup_datetime'].agg('count')
gorjetas_time_series
gorjetas_time_series.plot(style="-o", figsize=(30, 10))
import matplotlib.dates as mdates
# Plot with day-month tick labels, one tick per day.
plt.figure(figsize=(30,12))
plt.plot(gorjetas_time_series, marker='o', linestyle='-')
ax = plt.gca()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m'))
ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))
plt.gcf().autofmt_xdate()
plt.show()
# Latest data on hand is from October 2012.
full_dataframe_sorted = full_dataframe_payment.sort_values(by=['year','month'])
full_dataframe_sorted.tail()
# 5. Average trip duration on Saturdays and Sundays.
# .copy() makes this an independent frame: the column assignments below were
# writing into a boolean-mask slice, which raises SettingWithCopyWarning and
# may silently fail to stick.
weekend_data = full_dataframe_payment[full_dataframe_payment['week_day'].isin(['Saturday', 'Sunday'])].copy()
weekend_data.shape
weekend_data.dtypes
weekend_data['dropoff_datetime'] = pd.to_datetime(weekend_data['dropoff_datetime'])
# Trip duration in minutes: timedelta divided by a 1-minute Timedelta.
weekend_data['total_minutes'] = (weekend_data['dropoff_datetime'] - weekend_data['pickup_datetime']) / pd.Timedelta(minutes=1)
weekend_data.head()
media_corridas_weekend = weekend_data['total_minutes'].mean()
print(f'O tempo médio das corridas nos finais de semanas foi de {round(media_corridas_weekend,2)} minutos.')
# Fazer uma visualização em mapa com latitude e longitude de pickups and dropoffs no ano de 2010;
corridas_2010 = full_dataframe.loc['2010']
corridas_2010.shape
!pip install geopandas
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
corridas_2010.dtypes
import folium
map_1 = folium.Map(location=[40.767937,-73.982155 ],tiles='OpenStreetMap', zoom_start=12)
for each in corridas_2010[:10000].iterrows():
folium.CircleMarker([each[1]['pickup_latitude'],each[1]['pickup_longitude']],
radius=3,
color='blue',
popup=str(each[1]['pickup_latitude'])+','+str(each[1]['pickup_longitude']),
fill_color='#FD8A6C'
).add_to(map_1)
map_1